Q1.1 - Joins

Read in the CSV files and packages

Packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

q1 - Read csv’s

card_data <- read_csv("data/CardBase.csv") %>% 
  clean_names()
## Rows: 500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Card_Number, Card_Family, Cust_ID
## dbl (1): Credit_Limit
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
customer_data <- read_csv("data/CustomerBase.csv") %>% 
  clean_names()
## Rows: 5674 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Cust_ID, Customer_Segment, Customer_Vintage_Group
## dbl (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fraud_data <- read_csv("data/FraudBase.csv") %>% 
  clean_names()
## Rows: 109 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Transaction_ID
## dbl (1): Fraud_Flag
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
transaction_data <- read_csv("data/TransactionBase.csv") %>% 
  clean_names()
## Rows: 10000 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Transaction_ID, Transaction_Date, Credit_Card_ID, Transaction_Segment
## dbl (1): Transaction_Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Q2-

join card details and customer details by card id- so that all record of card details and any matching records in customer details are kept.

customer_data = 5674x4 card_data = 500x4

expected output= (500(plus matches if any)) x (4 + (3 from customer_data))

result shows a 1:1 relationship for customer_id

left_join(card_data, customer_data, by = c("cust_id" = "cust_id"))

Question 3-

Join fraud details with transaction details keep - all rows in both tables

transaction_data = 10,000 x 5 fraud_data = 109 x 2

expected output - (10,000 + additional matches) x 2+4

results- tell you that 1:1 relationship between transaction_id all transactions in the fraud data base are accounted for in the transaction_data

full_join(fraud_data, transaction_data, by = c("transaction_id" = "transaction_id"))

Question 4

Join card details with transaction details keep - rows from first match the second are returned only return results from 1st table once

card_data = 500x4 transaction_data = 10000 x 5 relationship - card:transaction = 1:many

result - 10000 x 3+5

right_join(card_data, transaction_data, by = c("card_number" = "credit_card_id"), keep = FALSE)
card_data
transaction_data
right_join(transaction_data, card_data, by = c("credit_card_id" = "card_number"), keep = FALSE)

1.2

Question 5

Read in data sets

hats <- read_csv("data/hat_observations.csv")
## Rows: 100 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): observation
## dbl (4): day, month, year, observation_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hat_separated <- hats %>% 
  separate(observation,
           into = c("hat_colour", "hat_type"),
           sep = ",")

hat_separated

Question 6

hat_dates <- hat_separated %>% 
  unite(date,
        c(day, month, year),
        sep = "/") %>% 
  filter(hat_type == "beret") %>% 
  slice_max(observation_count)

hat_dates

Extension

Q1

cand_number = cust_id, card_number = credit_card_id, transaction_id = transaction_id

left_join(card_data, customer_data, by = c("cust_id" = "cust_id")) %>% 
  left_join(transaction_data, by = c("card_number" = "credit_card_id")) %>% 
  right_join(fraud_data, by = c("transaction_id" = "transaction_id"))

Question 2

exam_score <- read_csv("data/exam_scores.csv")
## Rows: 200 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): id, exam_Q1, exam_Q2, exam_Q3, exam_Q4, exam_Q5, exam_Q6, exam_Q7,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Read in exam_scores and transform it into long format with two new columns exam_question and score. Then, using separate and select, remove superfluous information from the values in exam_question

# Don't need separate and select?
exam_score %>% 
  pivot_longer(cols = starts_with("exam"),
               names_to = "exam_question",
               names_prefix = "exam_Q",
               values_to = "score")
# Completing as per the question
exam_score %>% 
  pivot_longer(cols = starts_with("exam"),
               names_to = "exam_question",
               values_to = "score") %>% 
  separate(exam_question, into = c("temp", "exam_question"),
           sep = "Q") %>% 
  select(-temp)